This script takes the raw data downloaded from Crimson Hexagon and cleans it up for analysis. At the bottom of the script there are a few exploratory maps.
knitr::opts_chunk$set(message = F, warning = F)
library(tidyverse)
library(jsonlite)
library(ggmap)
library(leaflet)
library(sf)
library(readxl)
library(reticulate)
library(RColorBrewer)
library(kableExtra)
library(mapview)
Crimson Hexagon data is saved in two-day bulk exports. The CH website only allows exports of 10,000 randomly selected tweets, and there seemed to be between 10k and 15k tweets over any two-day period, so the data was exported in two-day chunks to capture as much of it as possible. Two filters were applied before downloading: the location was set to Santa Barbara (this does not mean the tweet was geotagged, only that it came from the area) and the post type was set to Original Tweet (not a retweet).
# list all .xlsx files exported from Crimson Hexagon
xl_files <- list.files("../data/daily", pattern = ".xlsx", full.names = TRUE)

ids <- data.frame()
for(i in 1:length(xl_files)){
  print(i)
  # get the tweet IDs (GUID column) from the Crimson Hexagon output
  ch_data <- read_excel(xl_files[i], skip = 1) %>%
    select(GUID)
  ids <- rbind(ch_data, ids)
}
# split the IDs into 29 roughly equal chunks
nums <- round(seq(1, nrow(ids), length.out = 30))
for(i in 1:29){
  n <- nums[i]
  n2 <- nums[i+1]
  df <- ids[n:n2,]
  # save as a .txt file to be read by the python twarc library;
  # keep the IDs as character (unquoted) so 64-bit tweet IDs don't lose precision
  write.table(df$GUID, file = paste0("../data/twitter_ids_", i, ".txt"), sep = "\t",
              quote = FALSE, row.names = FALSE, col.names = FALSE)
}
Now I use the Python library twarc from the terminal to “hydrate” the data using the tweet IDs. The Crimson Hexagon export does not give us much information, but twarc lets us use each tweet ID to grab a lot more (including coordinates for geotagged tweets).
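For reference, hydrating one of the ID chunks written above looks something like this in the terminal (twarc v1.x; paths assumed to match the files created earlier):

twarc hydrate ../data/twitter_ids_1.txt > ../data/tweets1.jsonl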
Once this is done, all tweets are saved in JSONL files, one per chunk.
# read each hydrated JSONL file into a list of data frames
tweet_files <- paste0("../data/tweets", 1:29, ".jsonl")
tweets_list <- lapply(tweet_files, function(f) stream_in(file(f)))
create_tweet_df <- function(tweets){
  # pull the columns we want from the parsed JSON (some are nested);
  # note that cbind() coerces everything to character here
  tweet_df <- as_tibble(cbind(
    as.character(tweets$created_at),
    as.character(tweets$id_str),
    as.character(tweets$full_text),
    as.character(tweets$user$id_str),
    as.character(tweets$user$location),
    as.character(tweets$geo$type),
    as.character(tweets$geo$coordinates),
    as.character(tweets$lang),
    as.character(tweets$retweet_count),
    as.character(tweets$favorite_count)))
  # assign column names
  names(tweet_df) <- c("created_at", "tweet_id", "full_text", "user_id", "user_location",
                       "geo_type", "geo_coordinates", "language", "retweet_count", "favorite_count")
  # keep only geotagged tweets; the IDs stay as character because 64-bit
  # tweet/user IDs lose precision when stored as doubles
  tweets_geo <- tweet_df %>%
    filter(!is.na(geo_type)) %>%
    mutate(retweet_count = as.numeric(retweet_count),
           favorite_count = as.numeric(favorite_count))
  return(tweets_geo)
}
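Note the cbind() call above: combining mixed types coerces everything to a character matrix, which is why the count columns are converted back to numeric afterwards. A minimal illustration:

# cbind() on mixed types yields a character matrix
cbind(a = 1:2, b = c("x", "y"))
#>      a   b
#> [1,] "1" "x"
#> [2,] "2" "y"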
Apply the function to each hydrated file.
df_list <- lapply(tweets_list, create_tweet_df)
Combine everything into one data frame.
all_df <- bind_rows(df_list)
Remove points outside of our area of interest. The Crimson Hexagon location filter used the bounding box c(-119.9, 34.38, -119.5, 34.48); the filter below keeps a slightly wider box so nearby geotags are retained.
# parse the coordinates, filter to the bounding box, and split created_at
# (format "Wed Jan 01 09:24:07 +0000 2016") into date/time pieces
tweet_data <- all_df %>%
  mutate(coords = gsub("\\)|c\\(", "", geo_coordinates)) %>%
  separate(coords, c("lat", "lon"), sep = ", ") %>%
  mutate_at(c("lon", "lat"), as.numeric) %>%
  filter(lat >= 33.88 & lat <= 34.6,
         lon <= -119.5 & lon >= -120.5) %>%
  # created_at is fixed-width, so split it by character position
  separate(created_at, into = c("Day", "Year"), sep = 26) %>%
  mutate(Year = as.numeric(Year)) %>%
  separate(Day, into = c("Day", "Date"), sep = 4) %>%
  separate(Date, into = c("Date", "Time"), sep = 7) %>%
  separate(Time, into = c("Time", "Extra"), sep = 9) %>%
  select(-Extra, -language, -geo_type, -Day) %>%
  separate(Date, into = c("Month", "Day"), sep = " ") %>%
  mutate(Day = as.numeric(Day)) %>%
  mutate(month_num = match(Month, month.abb)) %>%
  mutate(date = as.Date(paste0(month_num, "/", Day, "/", Year), tryFormats = "%m/%d/%Y"))
write_csv(tweet_data, "../data/geotagged_sb_tweets.csv")
#remove tweets from Jan-Apr 2015 because of the Twitter user interface change
tweet_data_sub <- tweet_data %>% filter(date > "2015-04-28")
write_csv(tweet_data_sub, "../data/geotagged_sb_tweets_post_apr_2015.csv")
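As an aside, the fixed-width splits above rely on Twitter's created_at layout; a less brittle parse (a sketch only, not used in the pipeline above, and assuming an English locale for the day/month abbreviations) would hand the whole string to as.POSIXct:

# parse Twitter's created_at directly; %z handles the +0000 offset
as.POSIXct("Wed Jan 01 09:24:07 +0000 2016",
           format = "%a %b %d %H:%M:%S %z %Y", tz = "UTC")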
Turn the tweet_data data frame into a spatial object.
tweet_data <- read_csv("../data/geotagged_sb_tweets_post_apr_2015.csv")
tweet_sf <- tweet_data %>%
  st_as_sf(coords = c("lon", "lat")) %>%
  st_set_crs(4326) # WGS84 lon/lat
# interactive cluster map of all geotagged tweets
map <- leaflet(tweet_data) %>%
  # base layer
  addProviderTiles(providers$CartoDB.Positron) %>%
  # tweet markers, clustered
  addCircleMarkers(lng = ~lon, lat = ~lat, popup = ~full_text,
                   radius = 3, stroke = FALSE, fillOpacity = 0.5,
                   clusterOptions = markerClusterOptions())
map
register_google(Sys.getenv("GOOGLE_ACCESS_TOKEN"))
# downtown Santa Barbara basemap
sb.map <- get_map("santa barbara, california", zoom = 14, maptype = "toner-lite")
ggmap(sb.map, legend = "none") +
  coord_equal() +
  labs(x = NULL, y = NULL,
       title = "Tweets in downtown Santa Barbara") +
  theme(axis.text = element_blank()) +
  geom_point(data = tweet_data, aes(x = lon, y = lat),
             size = 0.55, alpha = 0.2, color = "darkorchid4")
ggsave("../figs/all_tweets_sb_downtown.png")
# 8 sequential colours for the 8 count bins
cols <- brewer.pal(9, "OrRd")[2:9]
ggmap(sb.map, legend = "none") +
  coord_equal() +
  labs(x = NULL, y = NULL) +
  theme(axis.text = element_blank()) +
  geom_hex(data = tweet_data,
           aes(x = lon, y = lat,
               fill = cut(..count.., c(0, 5, 10, 50, 100, 500, 1000, 2500, Inf))),
           bins = 150) +
  scale_fill_manual(
    values = cols,
    labels = c("<5", "5-9", "10-49", "50-99",
               "100-499", "500-999", "1000-2499", "2500+")
  ) +
  labs(fill = "# Tweets",
       title = "Tweets in Santa Barbara 2015-2019")
ggsave("../figs/all_tweets_sb_static_hex_map.png")
# zoom out to the larger Santa Barbara area
sb.map <- get_map("santa barbara, california", zoom = 11, maptype = "toner-lite")
ggmap(sb.map, legend = "none") +
  coord_equal() +
  labs(x = NULL, y = NULL) +
  theme(axis.text = element_blank()) +
  geom_hex(data = tweet_data,
           aes(x = lon, y = lat,
               fill = cut(..count.., c(0, 5, 10, 50, 100, 500, 1000, 2500, Inf))),
           bins = 150) +
  scale_fill_manual(
    values = cols,
    labels = c("<5", "5-9", "10-49", "50-99",
               "100-499", "500-999", "1000-2499", "2500+")
  ) +
  labs(fill = "# Tweets",
       title = "Tweets in larger SB area 2015-2019")
Get hex density by overlaying the tweet points on a hexagon grid.
hex_grid <- read_sf("../data/sb_area_hexagons.shp") %>%
  st_transform(st_crs(tweet_sf))

# count the tweets that fall inside each hexagon
hex_tweet_count <- hex_grid %>%
  mutate(tweet_count = lengths(st_intersects(hex_grid, tweet_sf)))

mapview(hex_tweet_count %>% filter(tweet_count > 0), zcol = "tweet_count", layer.name = "# tweets")
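The counting works because st_intersects() returns, for each hexagon, the indices of the points that intersect it, so lengths() gives a per-hexagon point count. A toy illustration (made-up geometries, not project data):

pts <- st_as_sf(data.frame(x = c(0.5, 0.6, 2), y = c(0.5, 0.4, 2)),
                coords = c("x", "y"))
square <- st_sfc(st_polygon(list(rbind(c(0, 0), c(1, 0), c(1, 1), c(0, 1), c(0, 0)))))
lengths(st_intersects(square, pts)) # 2 of the 3 points fall inside the unit square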
Why are there so many tweets near the hospital at De la Vina and Arrellaga? Let’s take a closer look at tweet counts by geo_coordinates.
geo_tweets <- tweet_data %>%
  group_by(geo_coordinates) %>%
  summarize(count = n()) %>%
  arrange(desc(count))
head(geo_tweets)
## # A tibble: 6 x 2
## geo_coordinates count
## <chr> <int>
## 1 c(34.4258, -119.714) 11572
## 2 c(34.42, -119.7) 2062
## 3 c(34.4337, -119.632) 913
## 4 c(34.39916667, -119.51638889) 704
## 5 c(34.41938, -119.69905) 666
## 6 c(34.4405, -119.814) 511
So one coordinate pair accounts for 11,572 tweets; the next highest is just 2,062.
sb.zoom.map <- get_map(location = c(-119.7158247, 34.4262342), zoom = 17, maptype = "toner-lite")
ggmap(sb.zoom.map, legend = "none") +
  coord_equal() +
  labs(x = NULL, y = NULL) +
  theme(axis.text = element_blank()) +
  geom_hex(data = tweet_data, aes(x = lon, y = lat), bins = 50)
The light blue hexagon corresponds to the coordinate pair c(34.4258, -119.714). I think this is the default coordinate assigned when someone tags “Santa Barbara” as their location. The first clue is that there is nothing of significance at this spot; it is a residential area. Let’s take a look at a handful of tweets coming from here.
delavina_tweets <- tweet_data %>%
  filter(geo_coordinates == "c(34.4258, -119.714)")
kable(sample_n(delavina_tweets, 10)) %>%
  kable_styling(bootstrap_options = c("striped", "condensed"), font_size = 10, fixed_thead = T)
| Month | Day | Time | Year | full_text | user_id | user_location | geo_coordinates | retweet_count | favorite_count | lat | lon | month_num | date |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Mar | 18 | 21:02:57 | 2018 | J-R Marathon & Mystical Traveler screening in Santa Barbara, Ca send light to Montecito "For… https://t.co/in4u7fWCeP | 1.554213e+07 | Santa Monica | c(34.4258, -119.714) | 0 | 0 | 34.4258 | -119.714 | 3 | 2018-03-18 |
| Nov | 22 | 18:07:29 | 2017 | Anyone else’s mom ALSO their #bestie?! ❤️ Cannot wait for Elizabeth Leibovitz to arrive this… https://t.co/J8KoX5JUVL | 2.435284e+09 | Denton, TX | c(34.4258, -119.714) | 0 | 0 | 34.4258 | -119.714 | 11 | 2017-11-22 |
| Jan | 11 | 21:33:06 | 2016 | DECCO Champagne Cage chair by Stabiles with Airplant Garden and Lamp. My Official Entry pic😎… https://t.co/7fLn7je3fc | 1.668713e+09 | Santa Barbara, California | c(34.4258, -119.714) | 0 | 0 | 34.4258 | -119.714 | 1 | 2016-01-11 |
| Jan | 31 | 17:28:13 | 2018 | One more from the weekend. @ Santa Barbara, California https://t.co/2GIqbdMqWp | 1.671418e+07 | California | c(34.4258, -119.714) | 0 | 4 | 34.4258 | -119.714 | 1 | 2018-01-31 |
| Jan | 27 | 02:56:47 | 2018 | Don’t ever give up on your dreams. You can do it! #followyourdreams #jacuzzi… https://t.co/V3zslaYCDq | 8.483431e+17 | Guarding the Universe | c(34.4258, -119.714) | 0 | 0 | 34.4258 | -119.714 | 1 | 2018-01-27 |
| Aug | 3 | 20:45:35 | 2018 | Wine tasting…living my best life. 🍷🍷🍷 . . . . . #wine #winelover #santabarbara #roadtrip #saturday #weekend #fbf #flashbackfriday #friends #girlfriends… https://t.co/mchy8fwVat | 2.042687e+07 | Los Angeles | c(34.4258, -119.714) | 2 | 0 | 34.4258 | -119.714 | 8 | 2018-08-03 |
| Jan | 1 | 09:24:07 | 2016 | Because kicking off 2016 dancing on a bench is clearly the best way to start the new year! #nye… https://t.co/dQhJtc3ko6 | 2.807708e+07 | California | c(34.4258, -119.714) | 0 | 0 | 34.4258 | -119.714 | 1 | 2016-01-01 |
| Jul | 14 | 03:04:23 | 2016 | One of those dresses that makes you feel like💃🏼 privacypls | https://t.co/fUflBTb1Oh… https://t.co/5bpbYjfsG2 | 3.383952e+09 | Los Angeles, CA | c(34.4258, -119.714) | 0 | 0 | 34.4258 | -119.714 | 7 | 2016-07-14 |
| Apr | 8 | 15:51:14 | 2019 | Just posted a photo @ Santa Barbara, California https://t.co/rcdLHkGz8o | 1.495026e+07 | San Jose, Berkeley, LA 🗼🌉🏙 | c(34.4258, -119.714) | 0 | 0 | 34.4258 | -119.714 | 4 | 2019-04-08 |
| Jul | 29 | 00:52:59 | 2018 | 🗓My Days JULY/28/2018🗓 🗝" #LOVEAF "🗝 #IAlone #ILoveYouALL #Analytics #NEVERQuitting #CapturedByCAL ____________________________________ #RecordingArtist #LifestyleBlogger #BrandAmbassador… https://t.co/YBakT9K4dz | 2.243870e+08 | Cerritos, CA | c(34.4258, -119.714) | 0 | 0 | 34.4258 | -119.714 | 7 | 2018-07-29 |
In mid-2019, Twitter removed the ability to tag a precise location in a tweet. I want to see the time frame for most of these tweets: if the majority fall in the latter half of 2019, it might be worth removing them…
check_geo <- tweet_data %>%
  filter(geo_coordinates == "c(34.4258, -119.714)") %>%
  group_by(date) %>%
  summarize(count = n())

ggplot(check_geo, aes(x = date, y = count)) +
  geom_line() +
  geom_smooth()
There doesn’t appear to be a significant dropoff in 2019. In fact, there are so few of these tweets at the beginning of the time series that I wonder whether the default coordinate was implemented later on.
Look at the log of tweet count.
hex_tweet_count <- hex_grid %>%
  mutate(tweet_count = lengths(st_intersects(hex_grid, tweet_sf)),
         log_tweet_count = log(tweet_count),
         # binned counts (computed here but not used in the map below)
         bin = case_when(
           tweet_count < 10 ~ 10,
           tweet_count >= 10 & tweet_count < 50 ~ 50,
           tweet_count >= 50 & tweet_count < 100 ~ 100,
           tweet_count >= 100 & tweet_count < 500 ~ 500,
           tweet_count >= 500 & tweet_count < 1000 ~ 1000,
           tweet_count >= 1000 & tweet_count < 2000 ~ 2000,
           tweet_count >= 2000 ~ 2001
         ))
log_hex_map <- mapview(hex_tweet_count %>% filter(tweet_count > 0), # remove hexes with no tweets
                       zcol = "log_tweet_count", layer.name = "Tweet count (log)")
log_hex_map
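As an aside, the case_when() ladder above could be written as a single cut() call; a sketch of the same binning (bin2 is a hypothetical column name, not used elsewhere):

hex_tweet_count$bin2 <- cut(hex_tweet_count$tweet_count,
                            breaks = c(-Inf, 10, 50, 100, 500, 1000, 2000, Inf),
                            right = FALSE) # right-open intervals, matching the case_when()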